I created several simple linear regression models, one per predictor, each using temperature as the dependent variable.
# Simple linear regression: temperature explained by low cloud coverage.
temp_lowc <- lm(temperature ~ cloudlow, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_lowc))
## term estimate std.error statistic p.value
## 1 (Intercept) 277.7132586 2.7138125 102.33325 5.691620e-78
## 2 cloudlow 0.6569496 0.1328456 4.94521 5.022951e-06
# Scatter plot of temperature against low cloud coverage with the fitted
# regression line. The intercept and slope are read from temp_lowc rather than
# typed in by hand, so the line always matches the fitted model exactly.
lowcg <- ggplot(combine, aes(x = cloudlow, y = temperature)) +
  geom_point() +
  xlab("Low Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_lowc)[["(Intercept)"]],
    slope = coef(temp_lowc)[["cloudlow"]],
    col = "red"
  )
lowcg
# Simple linear regression: temperature explained by middle cloud coverage.
temp_midc <- lm(temperature ~ cloudmid, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_midc))
## term estimate std.error statistic p.value
## 1 (Intercept) 313.376097 2.2739709 137.81008 5.642312e-87
## 2 cloudmid -1.167782 0.1110144 -10.51919 4.670486e-16
# Scatter plot of temperature against middle cloud coverage with the fitted
# regression line. The intercept and slope are read from temp_midc rather than
# typed in by hand, so the line always matches the fitted model exactly.
midcg <- ggplot(combine, aes(x = cloudmid, y = temperature)) +
  geom_point() +
  xlab("Middle Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_midc)[["(Intercept)"]],
    slope = coef(temp_midc)[["cloudmid"]],
    col = "red"
  )
midcg
# Simple linear regression: temperature explained by high cloud coverage.
temp_highc <- lm(temperature ~ cloudhigh, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_highc))
## term estimate std.error statistic p.value
## 1 (Intercept) 298.8933145 1.7554467 170.266243 2.193430e-93
## 2 cloudhigh -0.8519088 0.1541152 -5.527739 5.221285e-07
# Scatter plot of temperature against high cloud coverage with the fitted
# regression line. The intercept and slope are read from temp_highc rather
# than typed in by hand, so the line always matches the fitted model exactly.
highcg <- ggplot(combine, aes(x = cloudhigh, y = temperature)) +
  geom_point() +
  xlab("High Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_highc)[["(Intercept)"]],
    slope = coef(temp_highc)[["cloudhigh"]],
    col = "red"
  )
highcg
# Simple linear regression: temperature explained by ozone level.
temp_ozone <- lm(temperature ~ ozone, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_ozone))
## term estimate std.error statistic p.value
## 1 (Intercept) 337.8292030 12.16953566 27.760238 1.634629e-39
## 2 ozone -0.1525832 0.03904494 -3.907888 2.124625e-04
# Scatter plot of temperature against ozone level with the fitted regression
# line. The intercept and slope are read from temp_ozone rather than typed in
# by hand, so the line always matches the fitted model exactly.
ozoneg <- ggplot(combine, aes(x = ozone, y = temperature)) +
  geom_point() +
  xlab("Ozone Level") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_ozone)[["(Intercept)"]],
    slope = coef(temp_ozone)[["ozone"]],
    col = "red"
  )
ozoneg
# Simple linear regression: temperature explained by surface temperature.
temp_surftemp <- lm(temperature ~ surftemp, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_surftemp))
## term estimate std.error statistic p.value
## 1 (Intercept) 86.0301279 10.9520922 7.855132 3.376424e-11
## 2 surftemp 0.7002365 0.0374963 18.674818 6.705231e-29
# Scatter plot of temperature against surface temperature with the fitted
# regression line. The intercept and slope are read from temp_surftemp rather
# than typed in by hand, so the line always matches the fitted model exactly.
surfg <- ggplot(combine, aes(x = surftemp, y = temperature)) +
  geom_point() +
  xlab("Surface Temperature") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_surftemp)[["(Intercept)"]],
    slope = coef(temp_surftemp)[["surftemp"]],
    col = "red"
  )
surfg
# Simple linear regression: temperature explained by atmospheric pressure.
temp_pres <- lm(temperature ~ pressure, data = combine)
# Show the coefficient table as a plain data frame.
as.data.frame(tidy(temp_pres))
## term estimate std.error statistic p.value
## 1 (Intercept) 259.84483680 20.06505232 12.950120 2.938691e-20
## 2 pressure 0.03408459 0.02234757 1.525203 1.317141e-01
# Scatter plot of temperature against atmospheric pressure with the fitted
# regression line. The intercept and slope are read from temp_pres rather than
# typed in by hand, so the line always matches the fitted model exactly.
presg <- ggplot(combine, aes(x = pressure, y = temperature)) +
  geom_point() +
  xlab("Atmospheric Pressure") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_pres)[["(Intercept)"]],
    slope = coef(temp_pres)[["pressure"]],
    col = "red"
  )
presg
The `ggarrange()` call below combines all six graphs into one figure.
# Lay the six single-predictor plots out on a 3-column by 2-row grid.
figure <- ggarrange(
  lowcg, midcg, highcg,
  ozoneg, surfg, presg,
  ncol = 3,
  nrow = 2
)
figure
From the linear regressions, pressure was the only variable whose relationship with temperature was not statistically significant (p ≈ 0.13). Therefore, the multiple linear regression model will not use that variable for predictions.
# Multiple linear regression of temperature on the three cloud-coverage
# variables, ozone and surface temperature (pressure is left out).
model <- lm(
  temperature ~ cloudlow + cloudmid + cloudhigh + ozone + surftemp,
  data = combine
)
# Print residuals, coefficients and fit statistics.
summary(model)
##
## Call:
## lm(formula = temperature ~ cloudlow + cloudmid + cloudhigh +
## ozone + surftemp, data = combine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7001 -1.7232 -0.0064 1.7982 4.8737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.43522 22.89516 0.893 0.375
## cloudlow -0.65061 0.10505 -6.194 4.27e-08 ***
## cloudmid 0.16998 0.10875 1.563 0.123
## cloudhigh -0.43951 0.08269 -5.315 1.35e-06 ***
## ozone 0.01669 0.02003 0.833 0.408
## surftemp 0.95383 0.06855 13.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.427 on 66 degrees of freedom
## Multiple R-squared: 0.9261, Adjusted R-squared: 0.9205
## F-statistic: 165.3 on 5 and 66 DF, p-value: < 2.2e-16
I chose 50 random rows from the NASA data set (the first six are shown below):
# Draw 50 rows at random from the NASA data.
# NOTE(review): no set.seed() call is visible here; unless one is set earlier,
# a different sample is drawn on every run.
temp_pred <- dfnasa %>% sample_n(50)
# Preview the sampled rows.
head(temp_pred)
## lat long month year cloudhigh cloudlow cloudmid ozone
## 1 28.713043 -66.21739 5 2000 5.0 27.5 6.5 324
## 2 21.226087 -91.26087 2 1999 0.5 15.5 3.5 262
## 3 -8.721739 -86.25217 10 1995 0.5 67.0 12.5 266
## 4 3.756522 -83.74783 9 1998 18.5 36.0 23.5 264
## 5 -16.208696 -86.25217 1 2000 0.5 50.5 15.0 264
## 6 8.747826 -73.73043 11 1996 28.0 9.0 28.5 246
## pressure surftemp temperature
## 1 1000 296.0 296.9
## 2 1000 296.5 298.7
## 3 1000 290.2 294.6
## 4 1000 296.5 300.5
## 5 1000 293.2 296.5
## 6 995 295.0 301.0
The data frame below holds the predictor columns for the 50 random rows from the NASA data set.
# Predictor columns that the multiple regression model expects as input.
model_usage <- select(
  temp_pred,
  cloudhigh, cloudlow, cloudmid, ozone, surftemp
)
# Observed temperatures for the same rows, kept for comparison later.
real_temp <- select(temp_pred, temperature)
head(model_usage)
## cloudhigh cloudlow cloudmid ozone surftemp
## 1 5.0 27.5 6.5 324 296.0
## 2 0.5 15.5 3.5 262 296.5
## 3 0.5 67.0 12.5 266 290.2
## 4 18.5 36.0 23.5 264 296.5
## 5 0.5 50.5 15.0 264 293.2
## 6 28.0 9.0 28.5 246 295.0
The `model_usage` data frame is passed to the model to generate predictions, while the actual temperatures are kept in `real_temp` for comparison.
# Append a `pred` column holding the model's prediction for each sampled row.
model_predictions <- add_predictions(model_usage, model)
head(model_predictions)
## cloudhigh cloudlow cloudmid ozone surftemp pred
## 1 5.0 27.5 6.5 324 296.0 289.1908
## 2 0.5 15.5 3.5 262 296.5 297.9084
## 3 0.5 67.0 12.5 266 290.2 259.9896
## 4 18.5 36.0 23.5 264 296.5 280.0929
## 5 0.5 50.5 15.0 264 293.2 273.9777
## 6 28.0 9.0 28.5 246 295.0 292.6028
# Pair each observed temperature with the model's prediction for the same row.
# The columns are named at construction time, so no separate rename step is
# needed.
actual_preddf <- data.frame(
  real = real_temp$temperature,
  prediction = model_predictions$pred
)
# Interactive scatter plot of predicted against real temperature. Points on
# the 1:1 reference line (intercept 0, slope 1) are perfect predictions.
ggplotly(
  ggplot(actual_preddf) +
    geom_point(aes(x = real, y = prediction)) +
    geom_abline(intercept = 0, slope = 1, col = "darkturquoise", size = 1) +
    xlab("Real Temperature") +
    ylab("Predicted Temperature")
)